# standard imports
import pandas as pd
import numpy as np

# Do not change this option; This allows the CodeGrade auto grading to function correctly
pd.set_option('display.max_columns', 20)


### File names and data information ###
# central.csv: list of students that attend Central High School along with their class scores
# columbia.csv: list of students that attend Columbia High School along with their class scores
# eastside.csv: list of students that attend Eastside High School along with their class scores
# greenwich.csv: list of students that attend Greenwich High School along with their class scores
# school_info.csv: information about the four local schools
# activities.csv: list of students that participate in after school activities
# principal.csv: information about the principals for all the schools in the district, not just the 4 high schools that we're analyzing


# load data from each csv files and assign new DataFrame names to each
# read_csv with iso-8859-1 encoding

central_1 = pd.read_csv('central.csv', encoding='iso-8859-1')
columbia_1 = pd.read_csv('columbia.csv', encoding='iso-8859-1')
eastside_1 = pd.read_csv('eastside.csv', encoding='iso-8859-1')
greenwich_1 = pd.read_csv('greenwich.csv', encoding='iso-8859-1')
school_info_1 = pd.read_csv('school_info.csv', encoding='iso-8859-1')
activities_1 = pd.read_csv('activities.csv', encoding='iso-8859-1')
principal_1 = pd.read_csv('principal.csv', encoding='iso-8859-1')

# Copy each to new DF names so that we can have a copy of the original imports if needed
centeral = central_1.copy()
columbia = columbia_1.copy()
eastside = eastside_1.copy()
greenwich = greenwich_1.copy()
school_info = school_info_1.copy()
activities = activities_1.copy()
principal = principal_1.copy()


print('centeral', centeral.shape)
print('columbia', columbia.shape)
print('eastside', eastside.shape)
print('greenwich', greenwich.shape)
print('school_info', school_info.shape)
print('activities', activities.shape)
print('principal', principal.shape)

centeral (1001, 6)
columbia (1038, 6)
eastside (1011, 6)
greenwich (1027, 6)
school_info (4, 4)
activities (1711, 2)
principal (20, 3)


student_final = pd.concat([centeral, columbia, eastside, greenwich], axis=0)


student_final = student_final.merge(school_info, how='outer', left_on= 'School_Name', right_on='School')


student_final = student_final.merge(activities, how='outer', left_on= 'Student_ID', right_on='ID')


student_final = student_final.merge(principal, how='inner', left_on= 'School_Name', right_on='School')


student_final.head()


student_final = student_final.drop(['School_x', 'School_y', 'ID','School_Address'], axis = 1)


student_final.head()


student_final.shape

(4077, 11)


student_final.dtypes

Student_ID             int64
School_Name           object
Math                   int64
Science                int64
English                int64
History                int64
Address               object
Mascot                object
Student_Population     int64
Activity              object
Principal_Name        object
dtype: object


student_final["Grade_Average"] = (student_final["Math"] + student_final["Science"] + student_final["English"] + student_final["History"]) / 4


student_final.head()


student_final.loc[student_final['Grade_Average'].isnull(), 'Letter_Grade'] = 'None'
student_final.loc[student_final['Grade_Average'].between(0, 59.99, 'both'), 'Letter_Grade'] = 'F'
student_final.loc[student_final['Grade_Average'].between(60, 69.99, 'both'), 'Letter_Grade'] = 'D'
student_final.loc[student_final['Grade_Average'].between(70, 79.99, 'both'), 'Letter_Grade'] = 'C'
student_final.loc[student_final['Grade_Average'].between(80, 89.99, 'both'), 'Letter_Grade'] = 'B'
student_final.loc[student_final['Grade_Average'].between(90, 100, 'both'), 'Letter_Grade'] = 'A'


student_final.head()


student_final.shape

(4077, 13)


student_final['Letter_Grade'].value_counts()

C    1993
B    1744
D     182
A     158
Name: Letter_Grade, dtype: int64


percent_missing = student_final.isnull().sum() * 100 / len(student_final) 
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing


dup_percentage = student_final.duplicated().sum()/len(student_final)*100
dup_percentage = round(dup_percentage,2)
dup_percentage = dup_percentage.astype(str) + '%'
dup_percentage


student_final['Activity'] = student_final['Activity'].fillna('None')


percent_missing = student_final.isnull().sum() * 100 / len(student_final) 
percent_missing = round(percent_missing, 2)
percent_missing = percent_missing.astype(str) + '%'
percent_missing


students_final = student_final[['Student_ID', 'Math', 'Science', 'English', 'History','Grade_Average','Letter_Grade','Activity'
                               ,'School_Name','Address','Principal_Name','Mascot','Student_Population']]


# After all merging/combinning completed, we can check with this function by the order list of column names for student_final

for col in students_final.columns:
    print(col)

Student_ID
Math
Science
English
History
Grade_Average
Letter_Grade
Activity
School_Name
Address
Principal_Name
Mascot
Student_Population


students_final.dtypes

Student_ID              int64
Math                    int64
Science                 int64
English                 int64
History                 int64
Grade_Average         float64
Letter_Grade           object
Activity               object
School_Name            object
Address                object
Principal_Name         object
Mascot                 object
Student_Population      int64
dtype: object


students_final["Letter_Grade"] = students_final["Letter_Grade"].astype("category")


students_final.dtypes

Student_ID               int64
Math                     int64
Science                  int64
English                  int64
History                  int64
Grade_Average          float64
Letter_Grade          category
Activity                object
School_Name             object
Address                 object
Principal_Name          object
Mascot                  object
Student_Population       int64
dtype: object

	Student_ID	School_Name	Math	Science	English	History	School_x	Address	Mascot	Student_Population	ID	Activity	School_y	School_Address	Principal_Name
0	145581	Central	70	74	87	63	Central	100 Central High Lane	Eagle	300	NaN	NaN	Central	100 Central High Lane	Ray Smith
1	321209	Central	70	62	70	84	Central	100 Central High Lane	Eagle	300	NaN	NaN	Central	100 Central High Lane	Ray Smith
2	221982	Central	62	61	79	63	Central	100 Central High Lane	Eagle	300	NaN	NaN	Central	100 Central High Lane	Ray Smith
3	204249	Central	89	65	73	67	Central	100 Central High Lane	Eagle	300	NaN	NaN	Central	100 Central High Lane	Ray Smith
4	319950	Central	61	99	86	86	Central	100 Central High Lane	Eagle	300	319950.0	Cheer	Central	100 Central High Lane	Ray Smith

	Student_ID	School_Name	Math	Science	English	History	Address	Mascot	Student_Population	Activity	Principal_Name
0	145581	Central	70	74	87	63	100 Central High Lane	Eagle	300	NaN	Ray Smith
1	321209	Central	70	62	70	84	100 Central High Lane	Eagle	300	NaN	Ray Smith
2	221982	Central	62	61	79	63	100 Central High Lane	Eagle	300	NaN	Ray Smith
3	204249	Central	89	65	73	67	100 Central High Lane	Eagle	300	NaN	Ray Smith
4	319950	Central	61	99	86	86	100 Central High Lane	Eagle	300	Cheer	Ray Smith

	Student_ID	School_Name	Math	Science	English	History	Address	Mascot	Student_Population	Activity	Principal_Name	Grade_Average
0	145581	Central	70	74	87	63	100 Central High Lane	Eagle	300	NaN	Ray Smith	73.50
1	321209	Central	70	62	70	84	100 Central High Lane	Eagle	300	NaN	Ray Smith	71.50
2	221982	Central	62	61	79	63	100 Central High Lane	Eagle	300	NaN	Ray Smith	66.25
3	204249	Central	89	65	73	67	100 Central High Lane	Eagle	300	NaN	Ray Smith	73.50
4	319950	Central	61	99	86	86	100 Central High Lane	Eagle	300	Cheer	Ray Smith	83.00

	Student_ID	School_Name	Math	Science	English	History	Address	Mascot	Student_Population	Activity	Principal_Name	Grade_Average	Letter_Grade
0	145581	Central	70	74	87	63	100 Central High Lane	Eagle	300	NaN	Ray Smith	73.50	C
1	321209	Central	70	62	70	84	100 Central High Lane	Eagle	300	NaN	Ray Smith	71.50	C
2	221982	Central	62	61	79	63	100 Central High Lane	Eagle	300	NaN	Ray Smith	66.25	D
3	204249	Central	89	65	73	67	100 Central High Lane	Eagle	300	NaN	Ray Smith	73.50	C
4	319950	Central	61	99	86	86	100 Central High Lane	Eagle	300	Cheer	Ray Smith	83.00	B

Data Manipulation¶

Task: High School Students Merging Practice¶

Name:¶

Overview¶

Task¶

1. Load Data¶

Investigating into our Data Frames¶

2. Merge all Data Frames to get one Data Frame called: `students_final`¶

3/ Create new columns:¶

4/ Any missing values for the entire data set must be filled with the string None.¶

5/ Ensure that column names and data types match the below list and are in this exact order:¶

Column Dtype¶

Data Manipulation¶

Task: High School Students Merging Practice¶

Name:¶

Overview¶

Task¶

1. Load Data¶

Investigating into our Data Frames¶

2. Merge all Data Frames to get one Data Frame called: students_final¶

3/ Create new columns:¶

4/ Any missing values for the entire data set must be filled with the string None.¶

5/ Ensure that column names and data types match the below list and are in this exact order:¶

Column Dtype¶

2. Merge all Data Frames to get one Data Frame called: `students_final`¶